13 February 2018

Course material

What are we talking about?

  • Explore your dataset, graphically
  • Find relationships between variables
  • Find differences between groups

Loading the libraries

library("tidyverse")
## ── Attaching packages ────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1     ✔ purrr   0.2.4
## ✔ tibble  1.3.4     ✔ dplyr   0.7.4
## ✔ tidyr   0.7.2     ✔ stringr 1.2.0
## ✔ readr   1.1.1     ✔ forcats 0.2.0
## ── Conflicts ───────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

Let's talk about your data

How your data should look

alt text

Figure from http://r4ds.had.co.nz/tidy-data.html

Following three rules makes a dataset tidy: variables are in columns, observations are in rows, and values are in cells.

Loading the data

# Read the CSV into a tibble; readr prints the column types it guessed.
mydata <- read_csv(file = "datasets/architect.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   X1 = col_integer(),
##   FileName = col_character(),
##   genotype = col_character(),
##   Time = col_integer(),
##   TNLR = col_integer(),
##   N2LR = col_integer(),
##   Magnitude = col_integer(),
##   Altitude = col_integer(),
##   ExtPathLength = col_integer()
## )
## See spec(...) for full column specifications.

Look at the structure of the data

# Compact overview: one line per column with its type and first values.
utils::str(mydata)
## Classes 'tbl_df', 'tbl' and 'data.frame':    647 obs. of  20 variables:
##  $ X1           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ FileName     : chr  "dense-3-1-12_1" "dense-3-1-12_1" "dense-3-1-12_1" "dense-3-1-12_1" ...
##  $ genotype     : chr  "dense" "dense" "dense" "dense" ...
##  $ Time         : int  4 5 6 7 8 9 10 11 12 3 ...
##  $ TRL          : num  118 236 591 1594 3661 ...
##  $ GRTR         : num  118 118 354 1004 2067 ...
##  $ L1R          : num  118 236 354 472 591 ...
##  $ GR1R         : num  118 118 118 118 118 ...
##  $ TNLR         : int  0 3 13 22 29 36 44 54 54 0 ...
##  $ TLRL         : num  0.00 5.72e-05 2.36e+02 1.12e+03 3.07e+03 ...
##  $ N2LR         : int  0 3 13 22 29 36 44 54 54 0 ...
##  $ L2LR         : num  0.00 5.72e-05 2.36e+02 1.12e+03 3.07e+03 ...
##  $ ML2LR        : num  0.00 1.91e-05 1.82e+01 5.10e+01 1.06e+02 ...
##  $ GR2L         : num  0.00 5.72e-05 2.36e+02 8.86e+02 1.95e+03 ...
##  $ D2LR         : num  0 0.0127 0.0367 0.0466 0.0491 ...
##  $ Height       : num  116 231 348 465 581 ...
##  $ Width        : num  20.6 43.9 107 168.8 233 ...
##  $ Magnitude    : int  1 4 14 23 30 37 45 55 55 1 ...
##  $ Altitude     : int  1 4 14 23 30 37 45 55 55 1 ...
##  $ ExtPathLength: int  1 13 118 298 494 739 1079 1594 1594 1 ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 20
##   .. ..$ X1           : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ FileName     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ genotype     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ Time         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ TRL          : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ GRTR         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ L1R          : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ GR1R         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ TNLR         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ TLRL         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ N2LR         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ L2LR         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ ML2LR        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ GR2L         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ D2LR         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ Height       : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ Width        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ Magnitude    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ Altitude     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ ExtPathLength: list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"

Filter by row

Use dplyr to filter data based on specific values. %>% is called a pipe and allows you to queue up operations.

# Keep only the rows whose genotype is "dense", then preview the result.
mydata %>%
  filter(genotype == "dense") %>%
  head(n = 6)
## # A tibble: 6 x 20
##      X1       FileName genotype  Time       TRL      GRTR      L1R
##   <int>          <chr>    <chr> <int>     <dbl>     <dbl>    <dbl>
## 1     1 dense-3-1-12_1    dense     4  118.1103  118.1103 118.1103
## 2     2 dense-3-1-12_1    dense     5  236.2205  118.1103 236.2205
## 3     3 dense-3-1-12_1    dense     6  590.5513  354.3308 354.3307
## 4     4 dense-3-1-12_1    dense     7 1594.4887 1003.9373 472.4410
## 5     5 dense-3-1-12_1    dense     8 3661.4180 2066.9294 590.5511
## 6     6 dense-3-1-12_1    dense     9 6259.8433 2598.4253 708.6613
## # ... with 13 more variables: GR1R <dbl>, TNLR <int>, TLRL <dbl>,
## #   N2LR <int>, L2LR <dbl>, ML2LR <dbl>, GR2L <dbl>, D2LR <dbl>,
## #   Height <dbl>, Width <dbl>, Magnitude <int>, Altitude <int>,
## #   ExtPathLength <int>

Select by columns

You can also select, or drop specific columns using the select verb.

# Keep five named columns (in this order), then preview the result.
mydata %>%
  select(FileName, genotype, Time, Height, Width) %>%
  head(n = 6)
## # A tibble: 6 x 5
##         FileName genotype  Time   Height     Width
##            <chr>    <chr> <int>    <dbl>     <dbl>
## 1 dense-3-1-12_1    dense     4 115.9124  20.61023
## 2 dense-3-1-12_1    dense     5 231.4529  43.94336
## 3 dense-3-1-12_1    dense     6 348.0965 106.99701
## 4 dense-3-1-12_1    dense     7 465.2197 168.76273
## 5 dense-3-1-12_1    dense     8 581.1602 232.97046
## 6 dense-3-1-12_1    dense     9 698.4634 289.51532

Select by columns

You can also select, or drop specific columns using the select verb.

# Drop the X1 column and overwrite mydata with the result.
mydata <- mydata %>%
  select(-X1)

# Preview the first rows of the updated table.
mydata %>%
  head(n = 6)
## # A tibble: 6 x 19
##         FileName genotype  Time       TRL      GRTR      L1R     GR1R
##            <chr>    <chr> <int>     <dbl>     <dbl>    <dbl>    <dbl>
## 1 dense-3-1-12_1    dense     4  118.1103  118.1103 118.1103 118.1103
## 2 dense-3-1-12_1    dense     5  236.2205  118.1103 236.2205 118.1102
## 3 dense-3-1-12_1    dense     6  590.5513  354.3308 354.3307 118.1103
## 4 dense-3-1-12_1    dense     7 1594.4887 1003.9373 472.4410 118.1102
## 5 dense-3-1-12_1    dense     8 3661.4180 2066.9294 590.5511 118.1102
## 6 dense-3-1-12_1    dense     9 6259.8433 2598.4253 708.6613 118.1102
## # ... with 12 more variables: TNLR <int>, TLRL <dbl>, N2LR <int>,
## #   L2LR <dbl>, ML2LR <dbl>, GR2L <dbl>, D2LR <dbl>, Height <dbl>,
## #   Width <dbl>, Magnitude <int>, Altitude <int>, ExtPathLength <int>

Create new variables

And you can create new variables using the mutate verb.

# Add newvar (the natural log of TRL), keep three columns, and preview.
mydata %>%
  mutate(newvar = log(TRL)) %>%
  select(genotype, Time, newvar) %>%
  head(n = 6)
## # A tibble: 6 x 3
##   genotype  Time   newvar
##      <chr> <int>    <dbl>
## 1    dense     4 4.771619
## 2    dense     5 5.464766
## 3    dense     6 6.381057
## 4    dense     7 7.374308
## 5    dense     8 8.205606
## 6    dense     9 8.741910

More about data manipulation:

Let's have a look at the data

What is ggplot ?

Used to produce statistical graphics, main developer = Hadley Wickham

"attempt to take the good things about base and lattice graphics and improve on them with a strong, underlying model"

based on The Grammar of Graphics by Leland Wilkinson, 2005

describes the meaning of what we do when we construct statistical graphics … More than a taxonomy … Computational system based on the underlying mathematics of representing statistical functions of data.

  • does not limit developer to a set of pre-specified graphics
  • adds some concepts to grammar which allow it to work well with R

ggplot components

  • data: in ggplot2, data must be stored as an R data frame
  • coordinate system: describes 2-D space that data is projected onto
  • geoms: describe type of geometric objects that represent data
  • aesthetics: describe visual characteristics that represent data
  • scales: for each aesthetic, describe how visual characteristic is converted to display values
  • stats: describe statistical transformations that typically summarize data
  • facets: describe how data is split into subsets and displayed as multiple small graphs

Data and aesthetics

We first create the plot, by setting the data and the aesthetics.

# Base plot object: data plus the x/y mapping, no geom layer yet.
myplot <- ggplot(data = mydata, mapping = aes(x = TRL, y = TNLR))
myplot

Adding a geometry - Points

We need to add a geom to display the plot. Different geom can be used.

# Add a point layer to the base plot.
myplot +
  geom_point()

Adding a geometry - Lines

We need to add a geom to display the plot. Different geom can be used.

# Add a line layer to the base plot.
myplot +
  geom_line()

Adding a geometry - steps

We need to add a geom to display the plot. Different geom can be used.

# Add a step layer to the base plot.
myplot +
  geom_step()

Combining geoms

The advantage of using a layered approach, is that the layers can be combined. For instance, several geom can be used in the same plot.

# Stack two layers on the same base plot: points first, then steps.
myplot +
  geom_point() +
  geom_step()

Using discrete categories

Let's use the different categories we have. For this, we add a colour argument in the aes. ggplot will automatically pick a discrete color scale.

# Map colour to genotype, a character column.
ggplot(
  data = mydata,
  mapping = aes(x = TRL, y = TNLR, colour = genotype)
) +
  geom_point()

Using continuous categories

Let's use one of the variables as a continuous category. For this, we add a colour argument in the aes. ggplot will automatically pick a continuous color scale.

# Map colour to Height, a numeric column.
ggplot(
  data = mydata,
  mapping = aes(x = TRL, y = TNLR, colour = Height)
) +
  geom_point()

Using two categories

When using multiple categories, we can define both different colors (colour) and point styles (shape) in the aes argument.

# Two mappings at once: colour follows Time, point shape follows genotype.
ggplot(
  data = mydata,
  mapping = aes(x = TRL, y = TNLR, colour = Time, shape = genotype)
) +
  geom_point()

Removing chart junk

# Same mappings as before, drawn with theme_classic() instead of the
# default theme.
ggplot(
  data = mydata,
  mapping = aes(x = TRL, y = TNLR, colour = Time, shape = genotype)
) +
  geom_point() +
  theme_classic()

Using facets

Facets can be used to split the data and present the subsets side by side.

# One panel per genotype via facet_wrap().
ggplot(
  data = mydata,
  mapping = aes(x = TRL, y = TNLR, colour = genotype)
) +
  geom_point() +
  facet_wrap(~genotype)

Adding stats to the plots

ggplot has some built-in stat functions that can be directly used in the plots.

# Points plus a smoother with default settings (the method it picks is
# reported in a message).
ggplot(
  data = mydata,
  mapping = aes(x = TRL, y = TNLR, colour = genotype)
) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess'

Adding stats to the plots

ggplot has some built-in stat functions that can be directly used in the plots.

# Same smoother, with the confidence band switched off (se = FALSE).
ggplot(
  data = mydata,
  mapping = aes(x = TRL, y = TNLR, colour = genotype)
) +
  geom_point() +
  geom_smooth(se = FALSE)

Adding stats to the plots

ggplot has some built-in stat functions that can be directly used in the plots.

# Linear-model fit (method = "lm") without the confidence band.
ggplot(
  data = mydata,
  mapping = aes(x = TRL, y = TNLR, colour = genotype)
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

Adding stats to the plots

ggplot has some built-in stat functions that can be directly used in the plots.

# Points plus a stat_ellipse() layer using the same colour mapping.
ggplot(
  data = mydata,
  mapping = aes(x = TRL, y = TNLR, colour = genotype)
) +
  geom_point() +
  stat_ellipse()

Boxplot

# Distribution of TRL for each genotype.
ggplot(data = mydata, mapping = aes(x = genotype, y = TRL)) +
  geom_boxplot()

View everything at once

Now, we can also try to see all the data at once. For this, we need to play a bit with the initial data, to move from a wide to a long format.

# Reshape from wide to long: every column from TRL to ExtPathLength is
# stacked into a "variable"/"value" pair of columns.
mydatalong <- mydata %>%
  gather(key = "variable", value = "value", TRL:ExtPathLength)

# Preview the first rows of the long table.
mydatalong %>%
  head(n = 6)
## # A tibble: 6 x 5
##         FileName genotype  Time variable     value
##            <chr>    <chr> <int>    <chr>     <dbl>
## 1 dense-3-1-12_1    dense     4      TRL  118.1103
## 2 dense-3-1-12_1    dense     5      TRL  236.2205
## 3 dense-3-1-12_1    dense     6      TRL  590.5513
## 4 dense-3-1-12_1    dense     7      TRL 1594.4887
## 5 dense-3-1-12_1    dense     8      TRL 3661.4180
## 6 dense-3-1-12_1    dense     9      TRL 6259.8433

View everything at once

# Draw one boxplot panel per variable, comparing genotypes
mydatalong %>%
  filter(Time == max(Time)) %>% # keep only the last time point
  ggplot(mapping = aes(x = genotype, y = value, colour = genotype)) +
  geom_boxplot() +
  facet_wrap(~variable, nrow = 2) +
  theme(text = element_text(size = 9))

View everything at once

View everything at once

# Draw one boxplot panel per variable, comparing genotypes;
# scales = "free" lets each panel use its own axis ranges
mydatalong %>%
  filter(Time == max(Time)) %>% # keep only the last time point
  ggplot(mapping = aes(x = genotype, y = value, colour = genotype)) +
  geom_boxplot() +
  facet_wrap(~variable, nrow = 2, scales = "free") +
  theme(text = element_text(size = 9))

View everything at once

Interact with your data

Using the plotly library

Plotly creates leading open source tools for composing, editing, and sharing interactive data visualization via the Web.

https://plot.ly/

library("plotly")

plotly works great with ggplot

Any type of ggplot can be wrapped up in a plotly figure

# Build a static ggplot first ...
pl <- ggplot(
  data = mydata,
  mapping = aes(x = TRL, y = TNLR, colour = genotype)
) +
  geom_point()

# ... then hand it to ggplotly() to get the interactive version.
ggplotly(pl)

plotly works great with ggplot

We can choose what type of label we want to see

# Same scatter plot, with FileName added as a label aesthetic.
pl <- ggplot(
  data = mydata,
  mapping = aes(x = TRL, y = TNLR, colour = genotype, label = FileName)
) +
  geom_point()

# Convert the ggplot object into an interactive plotly figure.
ggplotly(pl)

plotly works great with ggplot

# Boxplot of TRL per genotype, restricted to the last time point.
pl <- mydata %>%
  filter(Time == max(Time)) %>%
  ggplot(mapping = aes(x = genotype, y = TRL, label = FileName)) +
  geom_boxplot()

# Convert the ggplot object into an interactive plotly figure.
ggplotly(pl)

More info